//go:build amd64
// +build amd64

package common

import (
	"golang.org/x/sys/cpu"
)

// ZetasAVX2 contains all ζ used in NTT (like the Zetas array), but also
// the values int16(zeta * 62209) for each zeta, which is used in
// Montgomery reduction.  There is some duplication and reordering as
// compared to Zetas to make it more convenient for use with AVX2.
//
// Here 62209 ≡ q⁻¹ (mod 2¹⁶) for q = 3329, so each pair
// (int16(zeta*62209), zeta) is exactly what a Montgomery multiplication
// by zeta needs.  The "offset" notes below are in units of 16 int16s
// (one 256-bit AVX2 register) and must stay in sync with the assembly
// that indexes into this table.
var ZetasAVX2 = [...]int16{
	// level 1: int16(Zetas[1]*62209) and Zetas[1]
	31499, 2571,

	// level 2
	//
	// int16(Zetas[2]*62209), Zetas[2], int16(Zetas[3]*62209), Zetas[3]
	14746, 2970, 788, 1812,

	// level 3, like level 2.
	13525, 1493, -12402, 1422, 28191, 287, -16694, 202,

	0, 0, // padding to align the next group to a 16-element boundary

	// layer 4. offset: 1*16
	//
	// The precomputed multiplication and zetas are grouped by 16 at a
	// time as used in the set of butterflies, etc.
	-20906, -20906, -20906, -20906, -20906, -20906, -20906, -20906,
	27758, 27758, 27758, 27758, 27758, 27758, 27758, 27758,
	3158, 3158, 3158, 3158, 3158, 3158, 3158, 3158,
	622, 622, 622, 622, 622, 622, 622, 622,
	-3799, -3799, -3799, -3799, -3799, -3799, -3799, -3799,
	-15690, -15690, -15690, -15690, -15690, -15690, -15690, -15690,
	1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577,
	182, 182, 182, 182, 182, 182, 182, 182,
	10690, 10690, 10690, 10690, 10690, 10690, 10690, 10690,
	1359, 1359, 1359, 1359, 1359, 1359, 1359, 1359,
	962, 962, 962, 962, 962, 962, 962, 962,
	2127, 2127, 2127, 2127, 2127, 2127, 2127, 2127,
	-11201, -11201, -11201, -11201, -11201, -11201, -11201, -11201,
	31164, 31164, 31164, 31164, 31164, 31164, 31164, 31164,
	1855, 1855, 1855, 1855, 1855, 1855, 1855, 1855,
	1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468,

	// layer 5. offset: 9*16
	//
	// Same layout, but zetas now repeat 4 at a time.
	-5827, -5827, -5827, -5827, 17364, 17364, 17364, 17364,
	-26360, -26360, -26360, -26360, -29057, -29057, -29057, -29057,
	573, 573, 573, 573, 2004, 2004, 2004, 2004,
	264, 264, 264, 264, 383, 383, 383, 383,
	5572, 5572, 5572, 5572, -1102, -1102, -1102, -1102,
	21439, 21439, 21439, 21439, -26241, -26241, -26241, -26241,
	2500, 2500, 2500, 2500, 1458, 1458, 1458, 1458,
	1727, 1727, 1727, 1727, 3199, 3199, 3199, 3199,
	-28072, -28072, -28072, -28072, 24313, 24313, 24313, 24313,
	-10532, -10532, -10532, -10532, 8800, 8800, 8800, 8800,
	2648, 2648, 2648, 2648, 1017, 1017, 1017, 1017,
	732, 732, 732, 732, 608, 608, 608, 608,
	18427, 18427, 18427, 18427, 8859, 8859, 8859, 8859,
	26676, 26676, 26676, 26676, -16162, -16162, -16162, -16162,
	1787, 1787, 1787, 1787, 411, 411, 411, 411,
	3124, 3124, 3124, 3124, 1758, 1758, 1758, 1758,

	// layer 6. offset: 17*16
	//
	// Same layout, but zetas now repeat 2 at a time.
	-5689, -5689, -6516, -6516, 1497, 1497, 30967, 30967,
	-23564, -23564, 20179, 20179, 20711, 20711, 25081, 25081,
	1223, 1223, 652, 652, 2777, 2777, 1015, 1015,
	2036, 2036, 1491, 1491, 3047, 3047, 1785, 1785,
	-12796, -12796, 26617, 26617, 16065, 16065, -12441, -12441,
	9135, 9135, -649, -649, -25986, -25986, 27837, 27837,
	516, 516, 3321, 3321, 3009, 3009, 2663, 2663,
	1711, 1711, 2167, 2167, 126, 126, 1469, 1469,
	19884, 19884, -28249, -28249, -15886, -15886, -8898, -8898,
	-28309, -28309, 9076, 9076, -30198, -30198, 18250, 18250,
	2476, 2476, 3239, 3239, 3058, 3058, 830, 830,
	107, 107, 1908, 1908, 3082, 3082, 2378, 2378,
	13427, 13427, 14017, 14017, -29155, -29155, -12756, -12756,
	16832, 16832, 4312, 4312, -24155, -24155, -17914, -17914,
	2931, 2931, 961, 961, 1821, 1821, 2604, 2604,
	448, 448, 2264, 2264, 677, 677, 2054, 2054,

	// layer 7. offset: 25*16
	//
	// Same layout, with a distinct zeta per position.
	-334, 11182, -11477, 13387, -32226, -14233, 20494, -21655,
	-27738, 13131, 945, -4586, -14882, 23093, 6182, 5493,
	2226, 430, 555, 843, 2078, 871, 1550, 105,
	422, 587, 177, 3094, 3038, 2869, 1574, 1653,
	32011, -32502, 10631, 30318, 29176, -18741, -28761, 12639,
	-18485, 20100, 17561, 18525, -14430, 19529, -5275, -12618,
	3083, 778, 1159, 3182, 2552, 1483, 2727, 1119,
	1739, 644, 2457, 349, 418, 329, 3173, 3254,
	-31183, 20297, 25435, 2146, -7382, 15356, 24392, -32384,
	-20926, -6279, 10946, -14902, 24215, -11044, 16990, 14470,
	817, 1097, 603, 610, 1322, 2044, 1864, 384,
	2114, 3193, 1218, 1994, 2455, 220, 2142, 1670,
	10336, -21497, -7933, -20198, -22501, 23211, 10907, -17442,
	31637, -23859, 28644, -20257, 23998, 7757, -17422, 23132,
	2144, 1799, 2051, 794, 1819, 2475, 2459, 478,
	3221, 3021, 996, 991, 958, 1869, 1522, 1628,

	// The inverse-NTT zetas follow; within each group the order is
	// reversed relative to the forward layer it undoes.

	// layer 1 inverse
	23132, -17422, 7757, 23998, -20257, 28644, -23859, 31637,
	-17442, 10907, 23211, -22501, -20198, -7933, -21497, 10336,
	1628, 1522, 1869, 958, 991, 996, 3021, 3221,
	478, 2459, 2475, 1819, 794, 2051, 1799, 2144,
	14470, 16990, -11044, 24215, -14902, 10946, -6279, -20926,
	-32384, 24392, 15356, -7382, 2146, 25435, 20297, -31183,
	1670, 2142, 220, 2455, 1994, 1218, 3193, 2114,
	384, 1864, 2044, 1322, 610, 603, 1097, 817,
	-12618, -5275, 19529, -14430, 18525, 17561, 20100, -18485,
	12639, -28761, -18741, 29176, 30318, 10631, -32502, 32011,
	3254, 3173, 329, 418, 349, 2457, 644, 1739,
	1119, 2727, 1483, 2552, 3182, 1159, 778, 3083,
	5493, 6182, 23093, -14882, -4586, 945, 13131, -27738,
	-21655, 20494, -14233, -32226, 13387, -11477, 11182, -334,
	1653, 1574, 2869, 3038, 3094, 177, 587, 422,
	105, 1550, 871, 2078, 843, 555, 430, 2226,

	// layer 2 inverse
	-17914, -17914, -24155, -24155, 4312, 4312, 16832, 16832,
	-12756, -12756, -29155, -29155, 14017, 14017, 13427, 13427,
	2054, 2054, 677, 677, 2264, 2264, 448, 448,
	2604, 2604, 1821, 1821, 961, 961, 2931, 2931,
	18250, 18250, -30198, -30198, 9076, 9076, -28309, -28309,
	-8898, -8898, -15886, -15886, -28249, -28249, 19884, 19884,
	2378, 2378, 3082, 3082, 1908, 1908, 107, 107,
	830, 830, 3058, 3058, 3239, 3239, 2476, 2476,
	27837, 27837, -25986, -25986, -649, -649, 9135, 9135,
	-12441, -12441, 16065, 16065, 26617, 26617, -12796, -12796,
	1469, 1469, 126, 126, 2167, 2167, 1711, 1711,
	2663, 2663, 3009, 3009, 3321, 3321, 516, 516,
	25081, 25081, 20711, 20711, 20179, 20179, -23564, -23564,
	30967, 30967, 1497, 1497, -6516, -6516, -5689, -5689,
	1785, 1785, 3047, 3047, 1491, 1491, 2036, 2036,
	1015, 1015, 2777, 2777, 652, 652, 1223, 1223,

	// layer 3 inverse
	-16162, -16162, -16162, -16162, 26676, 26676, 26676, 26676,
	8859, 8859, 8859, 8859, 18427, 18427, 18427, 18427,
	1758, 1758, 1758, 1758, 3124, 3124, 3124, 3124,
	411, 411, 411, 411, 1787, 1787, 1787, 1787,
	8800, 8800, 8800, 8800, -10532, -10532, -10532, -10532,
	24313, 24313, 24313, 24313, -28072, -28072, -28072, -28072,
	608, 608, 608, 608, 732, 732, 732, 732,
	1017, 1017, 1017, 1017, 2648, 2648, 2648, 2648,
	-26241, -26241, -26241, -26241, 21439, 21439, 21439, 21439,
	-1102, -1102, -1102, -1102, 5572, 5572, 5572, 5572,
	3199, 3199, 3199, 3199, 1727, 1727, 1727, 1727,
	1458, 1458, 1458, 1458, 2500, 2500, 2500, 2500,
	-29057, -29057, -29057, -29057, -26360, -26360, -26360, -26360,
	17364, 17364, 17364, 17364, -5827, -5827, -5827, -5827,
	383, 383, 383, 383, 264, 264, 264, 264,
	2004, 2004, 2004, 2004, 573, 573, 573, 573,

	// layer 4 inverse
	31164, 31164, 31164, 31164, 31164, 31164, 31164, 31164,
	-11201, -11201, -11201, -11201, -11201, -11201, -11201, -11201,
	1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468,
	1855, 1855, 1855, 1855, 1855, 1855, 1855, 1855,
	1359, 1359, 1359, 1359, 1359, 1359, 1359, 1359,
	10690, 10690, 10690, 10690, 10690, 10690, 10690, 10690,
	2127, 2127, 2127, 2127, 2127, 2127, 2127, 2127,
	962, 962, 962, 962, 962, 962, 962, 962,
	-15690, -15690, -15690, -15690, -15690, -15690, -15690, -15690,
	-3799, -3799, -3799, -3799, -3799, -3799, -3799, -3799,
	182, 182, 182, 182, 182, 182, 182, 182,
	1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577,
	27758, 27758, 27758, 27758, 27758, 27758, 27758, 27758,
	-20906, -20906, -20906, -20906, -20906, -20906, -20906, -20906,
	622, 622, 622, 622, 622, 622, 622, 622,
	3158, 3158, 3158, 3158, 3158, 3158, 3158, 3158,

	// layer 5 inverse
	-16694, 202, 28191, 287, -12402, 1422, 13525, 1493,

	// layer 6 inverse
	788, 1812, 14746, 2970,

	// layer 7 inverse
	31499, 2571,
}

// Sets p to a + b.  Does not normalize coefficients.
func (p *Poly) Add(a, b *Poly) {
	// Fall back to the portable implementation when AVX2 is absent.
	if !cpu.X86.HasAVX2 {
		p.addGeneric(a, b)
		return
	}
	addAVX2((*[N]int16)(p), (*[N]int16)(a), (*[N]int16)(b))
}

// Sets p to a - b.  Does not normalize coefficients.
func (p *Poly) Sub(a, b *Poly) {
	// Fall back to the portable implementation when AVX2 is absent.
	if !cpu.X86.HasAVX2 {
		p.subGeneric(a, b)
		return
	}
	subAVX2((*[N]int16)(p), (*[N]int16)(a), (*[N]int16)(b))
}

// Executes an in-place forward "NTT" on p.
//
// Assumes the coefficients are in absolute value ≤q.  The resulting
// coefficients are in absolute value ≤7q.  If the input is in Montgomery
// form, then the result is in Montgomery form and so (by linearity of the NTT)
// if the input is in regular form, then the result is also in regular form.
// The order of coefficients will be "tangled". These can be put back into
// their proper order by calling Detangle().
func (p *Poly) NTT() {
	// Fall back to the portable implementation when AVX2 is absent.
	if !cpu.X86.HasAVX2 {
		p.nttGeneric()
		return
	}
	nttAVX2((*[N]int16)(p))
}

// Executes an in-place inverse "NTT" on p and multiply by the Montgomery
// factor R.
//
// Requires coefficients to be in "tangled" order, see Tangle().
// Assumes the coefficients are in absolute value ≤q.  The resulting
// coefficients are in absolute value ≤q.  If the input is in Montgomery
// form, then the result is in Montgomery form and so (by linearity)
// if the input is in regular form, then the result is also in regular form.
func (p *Poly) InvNTT() {
	// Fall back to the portable implementation when AVX2 is absent.
	if !cpu.X86.HasAVX2 {
		p.invNTTGeneric()
		return
	}
	invNttAVX2((*[N]int16)(p))
}

// Sets p to the "pointwise" multiplication of a and b.
//
// That is: InvNTT(p) = InvNTT(a) * InvNTT(b).  Assumes a and b are in
// Montgomery form.  Products between coefficients of a and b must be strictly
// bounded in absolute value by 2¹⁵q.  p will be in Montgomery form and
// bounded in absolute value by 2q.
//
// Requires a and b to be in "tangled" order, see Tangle().  p will be in
// tangled order as well.
func (p *Poly) MulHat(a, b *Poly) {
	// Fall back to the portable implementation when AVX2 is absent.
	if !cpu.X86.HasAVX2 {
		p.mulHatGeneric(a, b)
		return
	}
	mulHatAVX2((*[N]int16)(p), (*[N]int16)(a), (*[N]int16)(b))
}

// Puts p into the right form to be used with (among others) InvNTT().
func (p *Poly) Tangle() {
	// Without AVX2 the standard coefficient order is used throughout,
	// so there is nothing to rearrange.
	if !cpu.X86.HasAVX2 {
		return
	}
	tangleAVX2((*[N]int16)(p))
}

// Puts p back into standard form.
func (p *Poly) Detangle() {
	// Without AVX2 the standard coefficient order is used throughout,
	// so there is nothing to rearrange.
	if !cpu.X86.HasAVX2 {
		return
	}
	detangleAVX2((*[N]int16)(p))
}

// Almost normalizes coefficients.
//
// Ensures each coefficient is in {0, …, q}.
func (p *Poly) BarrettReduce() {
	// Fall back to the portable implementation when AVX2 is absent.
	if !cpu.X86.HasAVX2 {
		p.barrettReduceGeneric()
		return
	}
	barrettReduceAVX2((*[N]int16)(p))
}

// Normalizes coefficients.
//
// Ensures each coefficient is in {0, …, q-1}.
func (p *Poly) Normalize() {
	// Fall back to the portable implementation when AVX2 is absent.
	if !cpu.X86.HasAVX2 {
		p.normalizeGeneric()
		return
	}
	normalizeAVX2((*[N]int16)(p))
}
